1. One dimensional Partial Dependence Plot
#install.packages("randomForest")
#install.packages("pdp")
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pdp)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(reshape2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(randomForestSRC)
##
## randomForestSRC 3.2.3
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
##
## Attaching package: 'randomForestSRC'
## The following object is masked from 'package:pdp':
##
## partial
days <- read.csv("Datos/day.csv")
hour <- read.csv("Datos/hour.csv")
days$dteday <- as_date(days$dteday)
days_since <- select(days, workingday, holiday, temp, hum, windspeed, cnt)
days_since$days_since_2011 <- int_length(interval(ymd("2011-01-01"), days$dteday)) / (3600*24)
days_since$SUMMER <- ifelse(days$season == 3, 1, 0)
days_since$FALL <- ifelse(days$season == 4, 1, 0)
days_since$WINTER <- ifelse(days$season == 1, 1, 0)
days_since$MISTY <- ifelse(days$weathersit == 2, 1, 0)
days_since$RAIN <- ifelse(days$weathersit == 3 | days$weathersit == 4, 1, 0)
days_since$temp <- days_since$temp * 47 - 8
days_since$hum <- days_since$hum * 100
days_since$windspeed <- days_since$windspeed * 67
rf <- rfsrc(cnt~., data=days_since)
results <- select(days_since, days_since_2011, temp, hum, windspeed, cnt)
nr <- nrow(days_since)
for(c in names(results)[1:4])
{
for(i in 1:nr){
r <- days_since
r[[c]] <- days_since[[c]][i]
sal <- predict(rf, r)$predicted
results[[c]][i] <- sum(sal) / nr
}
}
p1 = ggplot(days_since, aes(x = days_since_2011, y = results$days_since_2011)) +
geom_line(color = "#9FBFB6") +
ylim(c(0,6000)) +
geom_rug(alpha = 0.7,color = '#D9A282', sides = "b") +
ylab("Prediction") +
xlab("Days since 2011")
p2 = ggplot(days_since, aes(x = temp, y=results$temp)) +
geom_line(color = "#9FBFB6") +
ylim(c(0,6000)) +
geom_rug(alpha = 0.7, color = '#D9A282',sides = "b")+
xlab("Temperature") +ylab("Prediction")
p3 = ggplot(days_since, aes(x = hum, y = results$hum)) +
geom_line(color = "#9FBFB6") +
ylim(c(0,6000)) +
geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") +
xlab("Humidity")+ ylab("Prediction")
p4 = ggplot(days_since, aes(x = windspeed, y = results$windspeed)) +
geom_line(color = "#9FBFB6") +
ylim(c(0,6000)) +
geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") +
xlab("Wind speed") + ylab("Prediction")
subplot(p1, p2, p3, p4, shareY = TRUE, shareX = FALSE, titleX = TRUE)
2. Bidimensional Partial Dependency Plot.
sampled <- sample_n(days_since, 40)
temp <- sampled$temp
hum <- sampled$hum
th <- inner_join(data.frame(temp),data.frame(hum), by=character())
## Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0.
## ℹ Please use `cross_join()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
th$p <- 0
for(i in 1:nrow(th)){
r <- days_since
r[["temp"]] <- th[["temp"]][i]
r[["hum"]] <- th[["hum"]][i]
sal <- predict(rf, r)$predicted
th[["p"]][i] <- sum(sal) / nr
}
p = ggplot(th, aes(x = temp, y = hum)) +
geom_tile(aes(fill = p), width = 10, height = 15) +
geom_rug(alpha = 0.01) +
xlab("Temperature") +
ylab("Humidity") +
scale_fill_gradientn(name = "Number of bikes",
colors = c("#9FBFB6", "white", "#D9A282"),
values = scales::rescale(c(10, 50, 90))) +
theme_minimal()
p

3. PDP to explain the price of a house.
set.seed(100)
d <- read.csv("Datos/kc_house_data.csv")
sampled <- sample_n(d, 1000)
sampled <- select(sampled, bedrooms, bathrooms, sqft_living, sqft_lot, floors, yr_built, price)
rf <- rfsrc(price~., data=sampled)
results <- select(sampled, bedrooms, bathrooms, sqft_living, floors, price)
nr <- nrow(sampled)
for(c in names(results)[1:4])
{
for(i in 1:nr){
r <- sampled
r[[c]] <- sampled[[c]][i]
sal <- predict(rf, r)$predicted
results[[c]][i] <- sum(sal) / nr
}
}
plot1 = ggplot(sampled, aes(x = bedrooms, y = results$bedrooms)) +
geom_line(color = '#9FBFB6') +
geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") +
ylab("Prediction") + xlab("Bedrooms")
plot2 = ggplot(sampled, aes(x = bathrooms, y = results$bathrooms)) +
geom_line(color = '#9FBFB6') +
geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") +
xlab("Bathrooms")
plot3 = ggplot(sampled, aes(x = sqft_living, y = results$sqft_living)) +
geom_line(color = '#9FBFB6') +
geom_rug(alpha = 0.7,color = '#D9A282', sides = "b") +
xlab("Sqft Living")
plot4 = ggplot(sampled, aes(x = floors, y = results$floors)) +
geom_line(color = '#9FBFB6') +
geom_rug(alpha = 0.7, color = '#D9A282',sides = "b")+
xlab("Floors")
subplot(plot1, plot2, plot3, plot4, shareX = FALSE, titleX = TRUE)